setup

library(tidyverse)
library(magrittr)
library(limma)
library(edgeR)
library(fgsea)
library(ggfortify)
library(ggrepel)
library(goseq)
library(cqn)
library(scales)
library(harmonicmeanp)
library(readxl)
library(gridExtra)
library(grid)
library(AnnotationHub)
library(ensembldb)
library(msigdbr)

# Visualisation
library(ggpubr)
library(pheatmap)
library(UpSetR)
library(kableExtra)
library(pander)
library(rtracklayer)
library(RColorBrewer)
library(cowplot)
library(ggeasy)
library(pathview)
library(ggsci)

# We will use a function by Ben Marwick for raincloud plots
# This code loads the function in the working environment
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")

annotations

ah <- AnnotationHub() %>%
    subset(species == "Mus musculus") %>%
    subset(rdataclass == "EnsDb")

ens98 <- ah[["AH75036"]] # for release 98
ex <- exonsBy(ens98, "tx")
tr <- transcripts(ens98)
gn <- genes(ens98)

# get the lengths of transcripts from the ensdb object
tr$len <- ex %>%
    width %>%
    lapply(sum) %>%
    .[names(tr)] %>%
    unlist()

# also get the mean GC (weighted by transcript length) and the mean length of alternate transcripts per gene. 
gc_len <- mcols(tr) %>%
    as.data.frame() %>%
    group_by(gene_id) %>%
    summarise(
        n = n(),
        meanGC = sum(len*gc_content) / sum(len),
        len = mean(len)
    ) %>% 
  left_join(gn %>% as.data.frame() %>% dplyr::select(gene_id, gene_name, gene_biotype, description, seqnames, entrezid)) %>% 
  column_to_rownames("gene_id")

Introduction

counts_zhaeoetal <- read_tsv("data/APOE-TR/04_featurecounts_zhaoetal/featureCounts_q10_ensRelease98.txt", skip = 1) %>% 
  dplyr::select(-c(Chr, Start, End, Strand, Length)) %>% 
  column_to_rownames("Geneid")

# tidy up column names
colnames(counts_zhaeoetal) %<>% 
  str_remove(pattern = "Aligned.sortedByCoord.out.bam") %>% 
  str_remove(pattern = "^[:alnum:]+_") %>% 
  str_remove(pattern = ".FCH[:alnum:]+_[:alnum:]+_[:alnum:]+")

get the metadata about the mice

This information was obtained from synapse. The metadata about the mice is over 2 spreadsheets. So i need to join them together.

meta_zhaoetal <- read_csv("data/APOE-TR/APOE-TR_biospecimen_metadata.csv") %>% 
  dplyr::filter(specimenID %in% colnames(counts_zhaeoetal)) %>% 
  dplyr::select(individualID, specimenID) %>% 
  left_join(
    read_csv("data/APOE-TR/APOE-TR _mouse_metadata.csv")
  ) %>% 
  mutate(
    Genotype = factor(genotype, levels = c("APOE3", "APOE2", "APOE4")), 
    Sex = factor(sex), 
    Age = case_when(
     specimenID = grepl(specimenID, pattern = "3M") ~ "3M",
     individualID = grepl(individualID, pattern = "12M") ~ "12M", 
     individualID = grepl(individualID, pattern = "24M") ~ "24M"
    ), 
    sample = specimenID
  ) %>%
  mutate(Age = factor(Age, levels = c("3M", "12M", "24M"))) %>%
  mutate(
    group = paste0(Genotype, "_", Age, "_", Sex) %>% as.factor(), 
    group_agebyGenotype = paste0(Genotype, "_", Age) %>% as.factor(), 
    litter = str_replace_all(dateBirth, pattern = "/", replacement = ".")
    ) %>% 
  dplyr::select(-age, - genotype, - sex) %>% 
  column_to_rownames("specimenID")

# extract the seq lane info from colnaqmes of featurecounts matrix and add to metadata
lanes <- read_tsv("data/APOE-TR/04_featurecounts_zhaoetal/featureCounts_q10_ensRelease98.txt", 
         skip = 1) %>% 
  colnames() %>% 
  as_tibble() %>% 
  dplyr::slice(-(1:6)) %>% 
  set_colnames("sample") %>% 
  mutate(lane = str_extract(sample, pattern ="L\\d"), 
         sample = str_remove(sample, pattern = "Aligned.sortedByCoord.out.bam"), 
         sample = str_remove(sample, pattern = "^[:alnum:]+_"), 
         sample = str_remove(sample, pattern = "\\.FCH.+")
  )

meta_zhaoetal %<>% 
  left_join(lanes)

check sex by expression of genes on sex chromosomes

In mice, only males contain the Y chromosome. Therefore, this allows me to confirm the sex of the samples by examining whether genes on the Y chromosome are expressed in the brains. Here, I will extract the genes from the Y chromosome which were detected in the RNA-seq experiment, and determine whether they are only expressed in males. Note that only 4 genes were detected.

yGenes <- 
  gc_len %>% 
  as.data.frame() %>% 
  rownames_to_column("gene_id") %>%
  dplyr::filter(seqnames == "Y") %>% 
  .$gene_id

beforecorrect_sex <- 
  counts_zhaeoetal %>% 
  as.data.frame() %>% 
  .[yGenes,] %>% 
   .[rowSums(cpm(.) >= 1) >= 47,] %>% 
  rownames_to_column("gene_id") %>% 
  gather(key = "sample", value = "counts", starts_with("A")) %>% 
  left_join(meta_zhaoetal) %>% 
  dplyr::filter(Age == "3M") %>% 
    left_join(gc_len %>% 
                as.data.frame() %>% 
                rownames_to_column("gene_id") %>%  
                dplyr::select(gene_id, gene_name)) %>% 
    dplyr::select(gene_name, sample, Sex, counts) %>% 
    ggplot(aes(x = sample, y = counts, fill = Sex)) +
    geom_boxplot(outlier.shape = NA, alpha = 0.8) +
    facet_wrap(~Sex, ncol = 1, scales = "free") +
  theme_bw() +
  easy_rotate_x_labels(angle = 315) +
  theme(legend.position = "none")

beforecorrect_sex
Distribution of counts assigned to genes on the Y chromosom

Distribution of counts assigned to genes on the Y chromosom

Sample APOE_3M_21 appears to be a male sample, and samples APOE_3M_30 and APOE_3M_7 appear to be female. Therefore, I will update the meta object to reflect this, Now all samples appeaar correct.

meta_zhaoetal %<>% 
  mutate(Sex = as.character(Sex), 
         Sex = case_when(
    sample == "APOE_3M_21"  ~ "male", 
    sample == "APOE_3M_30" ~ "female", 
    sample == "APOE_3M_7" ~ "female", 
    TRUE ~ Sex
  ), 
  Sex = as.factor(Sex)) 


# Then fix the group factor in the metadata spreadhset

meta_zhaoetal %<>% 
  as.data.frame() %>% 
  rownames_to_column("Sample") %>% 
   mutate(
    group = paste0(Genotype, "_", Age, "_", Sex) %>% as.factor(), 
    group_agebyGenotype = paste0(Genotype, "_", Age) %>% as.factor()
    ) %>% 
  column_to_rownames("Sample")

ggarrange(beforecorrect_sex,
          counts_zhaeoetal %>% 
            as.data.frame() %>% 
            .[yGenes,] %>% 
            .[rowSums(cpm(.) >= 1) >= 47,] %>% 
            rownames_to_column("gene_id") %>% 
            gather(key = "sample", value = "counts", starts_with("A")) %>% 
            left_join(meta_zhaoetal) %>% 
            dplyr::filter(Age == "3M") %>% 
            left_join(gc_len %>% 
                        as.data.frame() %>% 
                        rownames_to_column("gene_id") %>%  
                        dplyr::select(gene_id, gene_name)) %>% 
            dplyr::select(gene_name, sample, Sex, counts) %>% 
            ggplot(aes(x = sample, y = counts, fill = Sex)) +
            geom_boxplot(outlier.shape = NA, alpha = 0.8) +
            facet_wrap(~Sex, ncol = 1, scales = "free") +
            theme_bw() +
            easy_rotate_x_labels(angle = 315) +
            theme(legend.position = "none"), 
          labels = "AUTO", 
          ncol = 1
) 

  #ggsave("plotsforpub/checksex.png", width = 18, height = 10, units = "cm", dpi = 800, scale = 1.5)

filter lowly expressed genes and make DGE

Only the 3m samples are retained.

x_zhaoetal<-
  counts_zhaeoetal %>% 
  dplyr::select(meta_zhaoetal %>% 
                  dplyr::filter(Age == "3M") %>% 
                  .$sample) %>% 
  .[rowSums(cpm(.) >= 2) >= 8,] %>% 
  DGEList(
    samples = tibble(sample = colnames(.)) %>%
      left_join(meta_zhaoetal), 
    genes = gc_len[rownames(.),]
  ) %>% 
  calcNormFactors("TMM") %>% 
  estimateDisp()
## Design matrix not provided. Switch to the classic mode.
a <- 
  cpm(counts_zhaeoetal, log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(
    cols = everything(),
    names_to = "sample",
    values_to = "logCPM"
  ) %>% 
    split(f = .$sample) %>%
  lapply(function(y){
    d <- density(y$logCPM)
    tibble(
      sample = unique(y$sample),
      x = d$x,
      y = d$y
    )
  }) %>%
  bind_rows() %>%
  left_join(x_zhaoetal$samples) %>% 
    dplyr::filter(Age == "3M") %>% 
  ggplot(aes(x, y, colour = Genotype, group = sample)) +
  geom_line() +
  labs(
    x = "logCPM",
    y = "Density",
    colour = "Genotype"
  ) +
  scale_color_viridis_d(option = "plasma", end = 0.8) +
  ggtitle("Before filtering") +
  theme_bw()


ggarrange(a, 
           cpm(x_zhaoetal, log = TRUE) %>%
             as.data.frame() %>%
             pivot_longer(
               cols = everything(),
               names_to = "sample",
               values_to = "logCPM"
             ) %>% 
             split(f = .$sample) %>%
             lapply(function(y){
               d <- density(y$logCPM)
               tibble(
                 sample = unique(y$sample),
                 x = d$x,
                 y = d$y
               )
             }) %>%
             bind_rows() %>%
             left_join(x_zhaoetal$samples) %>%
             ggplot(aes(x, y, colour = Genotype, group = sample)) +
            geom_line() +
            labs(
              x = "logCPM",
              y = "Density",
               colour = "Genotype"
             ) +
             scale_color_viridis_d(option = "plasma", end = 0.8) +
             ggtitle("After filtering") +
             theme_bw(), 
           common.legend = TRUE, 
          labels = "AUTO") 

#ggsave("plotsforpub/filtering.png", width = 15, height = 5, units = "cm", scale = 1.4)

PCA on entire dataset

ggarrange(
  x_zhaoetal %>% 
    cpm(log = TRUE) %>% 
    t() %>% 
    prcomp() %>%
    autoplot(data = tibble(sample = rownames(.$x)) %>%
               left_join(x_zhaoetal$samples),
             colour = "Genotype", 
             shape = "Sex", 
             size = 4
    ) +
    theme_bw() +
    scale_color_viridis_d(option = "plasma", end = 0.8) +
    theme(legend.position = "left"),
  
  x_zhaoetal %>% 
    cpm(log = TRUE) %>% 
    t() %>% 
    prcomp() %>%
    autoplot(data = tibble(sample = rownames(.$x)) %>%
               left_join(x_zhaoetal$samples),
             colour = "dateBirth", 
             shape = "Genotype", 
             size = 4
    ) +
    theme_bw() +
    scale_color_futurama() +
    # scale_color_manual(values = brewer.pal(name = "Paired", 
    #                                        n = 11))+
    theme(legend.position = "right") +
    labs(colour = "Litter"), 
  common.legend = F, 
  labels = "AUTO"
)

 # ggsave("plotsforpub/PCA_all.png", width = 18, height = 8, units = "cm", scale = 1.4)

 x_zhaoetal %>% 
    cpm(log = TRUE) %>% 
    t() %>% 
    prcomp() %>%
    autoplot(data = tibble(sample = rownames(.$x)) %>%
               left_join(x_zhaoetal$samples),
             colour = "lane", 
             shape = "Genotype", 
             size = 4
    ) +
    theme_bw() +
    scale_color_d3()

    theme(legend.position = "left")
## List of 1
##  $ legend.position: chr "left"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE
allsampHM <- cpm(x_zhaoetal, log = TRUE) %>% 
  t() %>% 
  dist(method = "euclidean") %>% 
  as.matrix()

allsampHM_anno <-  tibble(
  sample = colnames(allsampHM)) %>% 
  left_join(x_zhaoetal$samples) %>% 
  dplyr::select(sample, Genotype, Sex , litter = dateBirth) %>% 
  column_to_rownames("sample")

newCols <- colorRampPalette(brewer.pal(name = "Paired", 
                                       n = length(unique(allsampHM_anno$litter))))
mycolors <- newCols(length(unique(allsampHM_anno$litter)))
names(mycolors) <- unique(allsampHM_anno$litter)


mycolors <- list(litter = mycolors, 
                 Sex = c("male" = "blue", "female" = "red"), 
                 Genotype = c("APOE3" = "black", "APOE2" = "grey75", "APOE4" = "grey35"))  

#png("plotsforpub/sampleheatmap.png", width = 1000, height = 1000, res = 100)
pheatmap(allsampHM, 
         color = viridis_pal()(1000), 
         annotation_col = allsampHM_anno, 
         annotation_colors = mycolors,
         treeheight_row = 0, 
         cellwidth = 8, 
         cellheight = 8, 
         fontsize_row = 7, 
         fontsize_col = 7
  )

#dev.off()

Library sizes

x_zhaoetal$samples %>% 
  ggplot(aes(x = sample, y = lib.size, fill = Genotype)) +
  geom_col() +
  facet_wrap(~Genotype, scales = "free_x") +
  theme_bw() +
  easy_rotate_x_labels(angle = 315) 

chi squre tests for independence

table(x_zhaoetal$samples$Genotype, x_zhaoetal$samples$dateBirth) %>% 
  chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 82.667, df = 20, p-value = 1.378e-09
# males only
table(x_zhaoetal$samples %>% 
        dplyr::filter(Sex == "male") %>% 
        .$Genotype,
      x_zhaoetal$samples %>% 
        dplyr::filter(Sex == "male") %>% 
        .$dateBirth
) %>% 
  chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 43.125, df = 14, p-value = 8.182e-05
## females only
table(x_zhaoetal$samples %>% 
        dplyr::filter(Sex == "female") %>% 
        .$Genotype,
      x_zhaoetal$samples %>% 
        dplyr::filter(Sex == "female") %>% 
        .$dateBirth
) %>% 
  chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 39.286, df = 14, p-value = 0.0003294

~~ DE analysis initial

design and contrasts matrix

design <- model.matrix(~0 + group,
                       data = x_zhaoetal$samples) %>% 
  set_colnames(gsub(pattern = "group",
                    replacement = "",
                    x = colnames(.)))

contrasts <- makeContrasts(
  levels = colnames(design), 
  
  `Female 2v3` = APOE2_3M_female - APOE3_3M_female,
  `Male 2v3` = APOE2_3M_male - APOE3_3M_male,
  `Female 4v3` = APOE4_3M_female - APOE3_3M_female,
  `Male 4v3` = APOE4_3M_male - APOE3_3M_male
  )

fit

fit_1_zhaoetal <- x_zhaoetal %>% 
  glmFit(design)

# call the toptables

topTables_glm_zhaoetal <- colnames(contrasts) %>%
  sapply(function(x){
    glmLRT(fit_1_zhaoetal, contrast = contrasts[,x]) %>%
      topTags(n = Inf) %>%
      .[["table"]] %>%
      as.data.frame() %>% 
      rownames_to_column("gene_id") %>% 
      as_tibble() %>%
      arrange(PValue) %>%
      mutate(
        comparison = x,
        DE = FDR < 0.05) %>% 
      dplyr::select(gene_name, logFC, logCPM, PValue, FDR, description, everything())
  }, simplify = FALSE)

Vis

topTables_glm_zhaoetal %>% 
  lapply(function(x){
    x %>%
      mutate(direction = case_when(
        sign(logFC) == "1" ~ "up",
        sign(logFC) == "-1" ~ "down"
      ))
  }) %>%
  bind_rows() %>%
  dplyr::filter(DE == TRUE) %>% 
  mutate(Sex = case_when(
    grepl(comparison, pattern = "Male") ~ "male", 
    grepl(comparison, pattern = "Female") ~ "female", 
  ), 
  comparison = case_when(
    grepl(comparison, pattern = "4v3") ~ "APOE4 v APOE3", 
    grepl(comparison, pattern = "2v3") ~ "APOE2 v APOE3", 
  )) %>% 
  ggplot(aes(y = `comparison`)) +
  geom_bar(stat = "count",
           width = 0.5,
           aes(fill = `direction`),
           position = "dodge") +
  easy_rotate_x_labels(angle = 315) +
  theme_bw() +
  scale_fill_npg() +
  theme(legend.position = "bottom") +
  ggtitle("Number of DE genes in each comparison") +
  labs(x = "Number of DE genes",
       y = "Contrast",
       colour = "Direction of change") +
  facet_wrap(~Sex, scales = "free_y")

  #ggsave("plotsforpub/initialDe.png", width = 10, height = 5, units = "cm", dpi = 800, scale = 2)
  

topTables_glm_zhaoetal %>% 
  bind_rows() %>% 
  ggplot(aes(x = logFC, y = -log10(PValue))) +
  geom_point(aes(colour = DE)) +
  facet_wrap(~ comparison, scales = "free") +
  theme_bw() +
  scale_color_manual(values = c("grey50", "red"))

topTables_glm_zhaoetal %>% 
  bind_rows() %>% 
  ggplot(aes(x = logCPM, y = logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5) +
  facet_wrap(~ comparison, scales = "free") +
  scale_color_manual(values = c("grey50", "red")) +
  theme_bw()

ggarrange(topTables_glm_zhaoetal %>% 
  bind_rows() %>% 
  mutate(rankstat = sign(logFC)*-log10(PValue)) %>% 
  ggplot(aes(x = meanGC, y = rankstat)) +
  geom_point(alpha = 0.5, 
             aes(colour = DE)) +
  coord_cartesian(ylim = c(-20, 20)) +
  facet_wrap(~comparison, scales = "free") +
  scale_color_manual(values = c("grey50", "red")) +
  theme_bw() + 
    labs(x = "Weighted average %GC per gene", 
         y = "sign(logFC)*-log10(PValue)", 
         colour = "Differentially expxressed?") +
  geom_smooth(se = F),

topTables_glm_zhaoetal %>% 
  bind_rows() %>% 
  mutate(rankstat = sign(logFC)*-log10(PValue)) %>% 
  ggplot(aes(x = len, y = rankstat)) +
  geom_point(alpha = 0.5, 
             aes(colour = DE)) +
  coord_cartesian(ylim = c(-20, 20)) +
  scale_x_log10() +
  facet_wrap(~ comparison, scales = "free") +
  scale_color_manual(values = c("grey50", "red")) +
  theme_bw() + 
      labs(x = "Average transcript length per gene ", 
         y = "sign(logFC)*-log10(PValue)", 
         colour = "Differentially expxressed?") +
  geom_smooth(se = F), 
labels = c("B", "C"), 
nrow = 1,
common.legend = TRUE
) 

  #ggsave("plotsforpub/GC_len.png", width = 18, height = 8, units = "cm", dpi = 800, scale = 2)

~~ cqn ~~

Since I observe bias for GC content and length for this dataset, cqn normalisation <Hansen et al, 2012> link is warranted. cqn is a normalisation technique which produces an offset for each gene, which corrects for GC content and length. This offset is compatible with edgeR, so I will perform the cqn normalisation then use this offset with the edgeR generalised linear model capabilities (negative binomial distribution) and liklihood ratio tests for differential expression.

cqn_zhaoetal <-
  x_zhaoetal %>% 
  with(
    cqn(
      counts = counts,
      x = genes$meanGC,
      lengths = genes$len,
      sizeFactors = samples$lib.size
    )
  )

plot out the model fits

par(mfrow = c(1, 2))
cqnplot(cqn_zhaoetal, n = 1, xlab = "GC Content")
cqnplot(cqn_zhaoetal, n = 2, xlab = "Length")
*Model fits for GC content and gene length under the CQN model. Variability is clearly visible, particularly for length*

Model fits for GC content and gene length under the CQN model. Variability is clearly visible, particularly for length

par(mfrow = c(1, 1))

PCA

PCA analysis was repeated. Similar clustering of samples was observed across PC1. However, samples appeared closer together.

logCPM_zhaeoetal <- cqn_zhaoetal$y + cqn_zhaoetal$offset

ggarrange(
  logCPM_zhaeoetal %>%
    t() %>%
    prcomp() %>% 
    autoplot(data = tibble(sample = rownames(.$x)) %>%
               left_join(x_zhaoetal$samples),
             colour = "Genotype", 
             shape = "Sex", 
             size = 4
    ) +
    theme_bw() +
    scale_color_viridis_d(option = "plasma", end = 0.8) +
    theme(legend.position = "left"),
  
  logCPM_zhaeoetal %>% 
    t() %>% 
    prcomp() %>%
    autoplot(data = tibble(sample = rownames(.$x)) %>%
               left_join(x_zhaoetal$samples),
             colour = "dateBirth", 
             shape = "Genotype", 
             size = 4
    ) +
    theme_bw() +
    scale_color_futurama() +
    theme(legend.position = "right") +
    labs(colour = "Litter"), 
  common.legend = F, 
  labels = "AUTO"
) 

#ggsave("plotsforpub/PCA_cqn.png", width = 18, height = 8, units = "cm", scale = 1.4)

DE with cqn

# Add the offset to the dge object 
x_zhaoetal$offset <- cqn_zhaoetal$glm.offset

# fit the glm
fit_cqn <- glmFit(x_zhaoetal, design)

# call the top table
toptables_cqn <-  colnames(contrasts) %>%
  sapply(function(x){
    glmLRT(fit_cqn, contrast = contrasts[,x]) %>%
      topTags(n = Inf) %>%
      .[["table"]] %>%
      as.data.frame() %>%
      rownames_to_column("gene_id") %>%
      as_tibble() %>%
      arrange(PValue) %>%
      mutate(
        coef = x,
        pbonf = p.adjust(PValue, "bonferroni"),
        DE = FDR < 0.05) %>%
      dplyr::select(gene_name, logFC, logCPM, PValue, pbonf, FDR, description, everything())
  }, simplify = FALSE)

vis

toptables_cqn %>% 
  lapply(function(x){
    x %>%
      mutate(Direction = case_when(
        sign(logFC) == "1" ~ "up",
        sign(logFC) == "-1" ~ "down"
      ))
  }) %>%
  bind_rows() %>%
  mutate(Sex = case_when(
    grepl(coef, pattern = "Male") ~ "male", 
    grepl(coef, pattern = "Female") ~ "female", 
  ), 
  comparison = case_when(
    grepl(coef, pattern = "4v3") ~ "APOE4 v APOE3", 
    grepl(coef, pattern = "2v3") ~ "APOE2 v APOE3", 
  )) %>% 
  dplyr::filter(DE == TRUE) %>%
  ggplot(aes(y = `comparison`)) +
  geom_bar(stat = "count",
           width = 0.5,
           aes(fill = Direction),
           position = "dodge") +
  easy_rotate_x_labels(angle = 315) +
  theme_bw() +
  scale_fill_npg() +
  theme(legend.position = "bottom") +
  ggtitle("Number of DE genes in each comparison") +
  labs(x = "Number of DE genes",
       y = "Contrast",
       colour = "Direction of change") +
  facet_wrap(~Sex, scales = "free_y") 

  #ggsave("plotsforpub/cqn_noDEgenes.png", width = 10, height = 5, units = "cm", dpi = 800, scale = 2)

ggarrange(
ggarrange(toptables_cqn %>% 
  bind_rows() %>% 
  mutate(rankstat = sign(logFC)*-log10(PValue)) %>% 
  ggplot(aes(x = meanGC, y = rankstat)) +
  geom_point(alpha = 0.5, 
             aes(colour = DE)) +
  coord_cartesian(ylim = c(-20, 20)) +
  facet_wrap(~coef, scales = "free") +
  scale_color_manual(values = c("grey50", "red")) +
  theme_bw() + 
    labs(x = "Weighted average %GC per gene", 
         y = "sign(logFC)*-log10(PValue)", 
         colour = "Differentially expxressed?") +
  geom_smooth(se = F),

toptables_cqn %>% 
  bind_rows() %>% 
  mutate(rankstat = sign(logFC)*-log10(PValue)) %>% 
  ggplot(aes(x = len, y = rankstat)) +
  geom_point(alpha = 0.5, 
             aes(colour = DE)) +
  coord_cartesian(ylim = c(-20, 20)) +
  scale_x_log10() +
  facet_wrap(~ coef, scales = "free") +
  scale_color_manual(values = c("grey50", "red")) +
  theme_bw() + 
      labs(x = "Average transcript length per gene ", 
         y = "sign(logFC)*-log10(PValue)", 
         colour = "Differentially expxressed?") +
  geom_smooth(se = F), 
labels = c("B", "C"), 
nrow = 1,
common.legend = TRUE
) ,  
ggarrange(
toptables_cqn %>% 
  bind_rows() %>% 
   mutate(Sex = case_when(
    grepl(coef, pattern = "Male") ~ "male", 
    grepl(coef, pattern = "Female") ~ "female", 
  ), 
  comparison = case_when(
    grepl(coef, pattern = "4v3") ~ "APOE4 v APOE3", 
    grepl(coef, pattern = "2v3") ~ "APOE2 v APOE3", 
  )) %>% 
  ggplot(aes(x = logFC, y = -log10(PValue))) +
  geom_point(aes(colour = DE), alpha = 0.5) +
  facet_wrap(~Sex+comparison, scales = "free_y") +
  theme_bw() +
  scale_y_continuous(limits = c(0, 20)) +
  scale_x_continuous(limits = c(-2, 2)) +
  labs(colour = "Differentially expressed?") +
  scale_color_manual(values = c("grey50", "red")),

toptables_cqn %>% 
  bind_rows() %>% 
   mutate(Sex = case_when(
    grepl(coef, pattern = "Male") ~ "male", 
    grepl(coef, pattern = "Female") ~ "female", 
  ), 
  comparison = case_when(
    grepl(coef, pattern = "4v3") ~ "APOE4 v APOE3", 
    grepl(coef, pattern = "2v3") ~ "APOE2 v APOE3", 
  )) %>% 
  ggplot(aes(x = logCPM, y = logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5) +
  facet_wrap(~Sex+comparison, scales = "free_y") +
  theme_bw() +
  scale_y_continuous(limits = c(-2, 2)) +
  labs(colour = "Differentially expressed?") +
  scale_color_manual(values = c("grey50", "red")),
labels = c("D", "E"), 
common.legend = TRUE
) , 
ncol = 1, 
common.legend = TRUE
) 

  #ggsave("plotsforpub/GC_lenandMDvol_cqn.png", width = 18, height = 20, units = "cm", dpi = 800, scale = 2)

~~ Enrichment testing within the DE genes using goseq:

gene sets

First I will import the gene sets I will use in this analysis.

KEGG <- msigdbr("Mus musculus", category = "C2", subcategory = "CP:KEGG")  %>% 
  left_join(gc_len %>% 
              dplyr::select(entrez_gene = entrezid) %>% 
              as.data.frame() %>% 
              rownames_to_column("gene_id") %>% 
              as_tibble() %>% 
              unnest()
            ) %>%
  dplyr::filter(gene_id %in% rownames(x_zhaoetal)) %>%
  distinct(gs_name, gene_id, .keep_all = TRUE) %>% 
  split(f = .$gs_name) %>%
  lapply(extract2, "gene_id")

ireGenes <- 
  read_excel("data/ireGenes.xlsx", sheet = "Mouse IREs") %>% 
    gather(key = "ire", value = "gene_id") %>% 
    left_join(x_zhaoetal$genes %>% 
                rownames_to_column("gene_id")) %>% 
    dplyr::filter(gene_name %in% toptables_cqn$`Female 2v3`$gene_name) %>% 
    split(f = .$ire) %>% 
    lapply(magrittr::extract2, "gene_id")

make the pwfs

The avergae transcript length per gene was used to calculate the pwf.

pwfs <- toptables_cqn %>% 
  lapply(function(x) {
    x %>%
      dplyr::select(gene_id, DE, len) %>%
      distinct(gene_id, .keep_all = TRUE) %>% 
      with(
        nullp(
          DEgenes = structure(
            as.integer(DE), names = gene_id
          ), 
          genome = "mm9", 
          id = "ensGene", 
          bias.data = len,
          plot.fit = FALSE
        )
      )
  })

One large enrichment test was performed on the overall gene sets

goseq_res <- pwfs %>% 
  lapply(function(x) {
    goseq(x, gene2cat = c(KEGG, ireGenes)) %>% 
      as_tibble %>%
      dplyr::filter(numDEInCat > 0) %>%
      mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>% 
      dplyr::select(-under_represented_pvalue) 
  }) 

goseq_res$`Female 2v3` %<>% mutate(coef = "Female 2v3")
goseq_res$`Male 2v3` %<>% mutate(coef = "Male 2v3")
goseq_res$`Female 4v3` %<>% mutate(coef = "Female 4v3")
goseq_res$`Male 4v3` %<>% mutate(coef = "Male 4v3")

results

goseq_res$`Female 2v3` %>% 
  head(5) %>% 
  kable(caption = "Top 5 over-represented KEGG or IRE gene sets in female APOE2 mice") %>% 
  kable_styling()
Top 5 over-represented KEGG or IRE gene sets in female APOE2 mice
category over_represented_pvalue numDEInCat numInCat FDR coef
KEGG_STEROID_BIOSYNTHESIS 0.0000001 9 14 0.0000094 Female 2v3
KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION 0.0049890 7 36 0.3966228 Female 2v3
KEGG_ADHERENS_JUNCTION 0.0133192 11 69 0.7059188 Female 2v3
KEGG_GLYCOSAMINOGLYCAN_DEGRADATION 0.0281626 4 16 0.8962891 Female 2v3
KEGG_RETINOL_METABOLISM 0.0370935 3 13 0.8962891 Female 2v3
goseq_res$`Female 4v3` %>% 
  head(5) %>% 
  kable(caption = "Top 5 over-represented KEGG or IRE gene sets in female APOE4 mice") %>% 
  kable_styling()
Top 5 over-represented KEGG or IRE gene sets in female APOE4 mice
category over_represented_pvalue numDEInCat numInCat FDR coef
KEGG_PYRUVATE_METABOLISM 0.0009277 8 33 0.0794708 Female 4v3
KEGG_ARGININE_AND_PROLINE_METABOLISM 0.0010254 9 41 0.0794708 Female 4v3
KEGG_SPLICEOSOME 0.0050944 16 123 0.2632109 Female 4v3
KEGG_GLYCOLYSIS_GLUCONEOGENESIS 0.0113565 7 39 0.4400631 Female 4v3
KEGG_AMINOACYL_TRNA_BIOSYNTHESIS 0.0159908 7 41 0.4571839 Female 4v3
goseq_res$`Male 2v3` %>% 
  head(5) %>% 
  kable(caption = "Top 5 over-represented KEGG or IRE gene sets in male APOE2 mice") %>% 
  kable_styling()
Top 5 over-represented KEGG or IRE gene sets in male APOE2 mice
category over_represented_pvalue numDEInCat numInCat FDR coef
KEGG_RENAL_CELL_CARCINOMA 0.0418446 5 69 0.9237325 Male 2v3
KEGG_NON_SMALL_CELL_LUNG_CANCER 0.0528538 4 51 0.9237325 Male 2v3
All 3’ IREs 0.0585138 69 1995 0.9237325 Male 2v3
KEGG_VEGF_SIGNALING_PATHWAY 0.0844445 4 61 0.9237325 Male 2v3
KEGG_GRAFT_VERSUS_HOST_DISEASE 0.0888479 1 4 0.9237325 Male 2v3
goseq_res$`Male 4v3` %>% 
  head(5) %>% 
  kable(caption = "Top 5 over-represented KEGG or IRE gene sets in male APOE4 mice") %>% 
  kable_styling()
Top 5 over-represented KEGG or IRE gene sets in male APOE4 mice
category over_represented_pvalue numDEInCat numInCat FDR coef
KEGG_CALCIUM_SIGNALING_PATHWAY 0.0001408 61 136 0.0261886 Male 4v3
KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION 0.0003786 64 149 0.0352067 Male 4v3
KEGG_TASTE_TRANSDUCTION 0.0011676 13 20 0.0723918 Male 4v3
KEGG_VASCULAR_SMOOTH_MUSCLE_CONTRACTION 0.0032600 38 89 0.1515881 Male 4v3
KEGG_RIBOSOME 0.0066362 28 82 0.2104138 Male 4v3
sig.genesets.goseq <- goseq_res %>% 
  bind_rows() %>% 
  dplyr::filter(FDR < 0.05) %>% .$category

anno_hm <- 
  toptables_cqn %>% 
  bind_rows() %>% 
  dplyr::filter(gene_id %in% c(KEGG$KEGG_CALCIUM_SIGNALING_PATHWAY)
  ) %>% 
  dplyr::select(gene_name, DE, coef) %>% 
  mutate(DE = as.character(DE)) %>% 
  spread(key = "coef", value = "DE") %>% 
  column_to_rownames("gene_name") 

toptables_cqn %>% 
  bind_rows() %>% 
  dplyr::filter(gene_id %in% KEGG$KEGG_CALCIUM_SIGNALING_PATHWAY) %>% 
  dplyr::select(gene_name, logFC, coef) %>% 
  spread(key = "coef", value = "logFC") %>% 
  column_to_rownames("gene_name") %>% 
  t() %>% 
  pheatmap(color = colorRampPalette(rev(brewer.pal(name = "RdBu", n = 5)))(100), 
           breaks = c(seq(min(.), 0, length.out=ceiling(100/2) + 1), 
                      seq(max(.)/100, max(.), length.out=floor(100/2))), 
           # cellwidth = 7, 
           # cellheight = 7, 
           fontsize = 7, 
           annotation_col = anno_hm, 
           annotation_colors = list(
             `Male 4v3` = c("FALSE" = "grey50", "TRUE" = "green"),
             `Female 4v3` = c("FALSE" = "grey50", "TRUE" = "green"), 
             `Male 2v3` = c("FALSE" = "grey50", "TRUE" = "green"),
             `Female 2v3` = c("FALSE" = "grey50", "TRUE" = "green")
           )
  )

#png("plotsforpub/upsetAPOE4DEgenes.png", width = 5000, height = 2500, res = 1000)
list(
  KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION = toptables_cqn$`Male 4v3` %>% 
    bind_rows() %>% 
    dplyr::filter(DE == TRUE, 
                  gene_id %in% KEGG$KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION) %>% 
    .$gene_name, 
  KEGG_CALCIUM_SIGNALING_PATHWAY = toptables_cqn$`Male 4v3` %>% 
    bind_rows() %>% 
    dplyr::filter(DE == TRUE, 
                  gene_id %in% KEGG$KEGG_CALCIUM_SIGNALING_PATHWAY) %>% 
    .$gene_name
 ) %>% 
  fromList() %>% 
  upset(order.by = 'freq', nsets = length(.))

#dev.off()

KEGG ribo pheatmap

#png("plotsforpub/kegg_ribo.png", width = 35, height = 20, units = "cm", res = 800)
toptables_cqn %>% 
  bind_rows() %>% 
  dplyr::filter(gene_id %in% KEGG$KEGG_RIBOSOME) %>%
  dplyr::select(gene_name, coef, logFC) %>% 
  spread(key = "coef", value = logFC) %>% 
  column_to_rownames("gene_name") %>% 
  t() %>% 
  pheatmap(color = colorRampPalette(rev(brewer.pal(n = 7, name =
                                                     "RdBu")))(100), 
           breaks = c(seq(min(.), 0, length.out=ceiling(100/2) + 1), 
                      seq(max(.)/100, max(.), length.out=floor(100/2))), 
           annotation_row = tibble(comparison = rownames(.)) %>% 
             mutate(Sex = case_when(
               grepl(comparison, pattern = "Male") ~ "male", 
               grepl(comparison, pattern = "Female") ~ "female", 
             ), 
             APOE = case_when(
               grepl(comparison, pattern = "4v3") ~ "APOE4 v APOE3", 
               grepl(comparison, pattern = "2v3") ~ "APOE2 v APOE3", 
             )) %>% 
             column_to_rownames("comparison"), 
           annotation_col = toptables_cqn %>% 
             bind_rows() %>% 
             dplyr::filter(gene_id %in% KEGG$KEGG_RIBOSOME) %>% 
             dplyr::select(gene_name, DE, coef) %>% 
             mutate(coef = str_replace(coef, pattern = "2", replacement = "APOE2"), 
                    coef = str_replace(coef, pattern = "3", replacement = "APOE3"), 
                    coef = str_replace(coef, pattern = "4", replacement = "APOE4"), 
                    coef =  str_replace(coef, pattern = "v", replacement = " v "), 
                    DE = as.character(DE)
             ) %>% 
             spread(key = "coef", value = "DE") %>% 
             column_to_rownames("gene_name"),
           annotation_colors = list("Male APOE4 v APOE3" = c("FALSE" = "grey60", 
                                                        "TRUE" = "green2"), 
                                    "Male APOE2 v APOE3" = c("FALSE" = "grey60", 
                                                        "TRUE" = "green2"), 
                                    "Female APOE4 v APOE3" = c("FALSE" = "grey60", 
                                                        "TRUE" = "green2"), 
                                    "Female APOE2 v APOE3" = c("FALSE" = "grey60", 
                                                        "TRUE" = "green2"), 
                                    APOE = c( "APOE2 v APOE3" = "purple", 
                                              "APOE4 v APOE3" = "orange"), 
                                    Sex = c(male = "black", 
                                            female = "grey50")
                                    ),
           cellheight = 15, 
           legend_breaks = c(seq(-.2, 0.2, by = 0.1), max(.)),
           legend_labels = c(seq(-.2, 0.2, by = 0.1), "logFC"),
           border_color = NA, 
           main = "KEGG_RIBOSOME", 
           fontsize_col= 5
           
  )

#dev.off()

ranked lists using fry, camera and fgsea

fryRes <- colnames(contrasts) %>% 
  sapply(function(x){
    logCPM_zhaeoetal %>% 
      fry(index = c(KEGG, ireGenes),
          design = design,
          contrast= contrasts[,x],
          sort = "directional"
      ) %>% 
      as.data.frame() %>% 
      rownames_to_column("geneset") %>% 
      as_tibble() %>% 
      mutate(pbonf = p.adjust(PValue, "bonferroni"), 
             coef = x)
  }, simplify = FALSE
  )


camera_res <- colnames(contrasts) %>% 
 sapply(function(x){
    logCPM_zhaeoetal %>% 
      camera(index = c(KEGG, ireGenes),
          design = design,
          contrast= contrasts[,x]
      ) %>% 
      as.data.frame() %>% 
      rownames_to_column("geneset") %>% 
      as_tibble() %>% 
      mutate(pbonf = p.adjust(PValue, "bonferroni"), 
             coef = x)
  }, simplify = FALSE
  )


# GSEA

# create a rank stat for fgsea
ranks <- 
  sapply(toptables_cqn, function(x) {
    x %>% 
      mutate(PValue_withsign = sign(logFC) * log10(1/PValue)) %>% 
      arrange(PValue_withsign) %>% 
      dplyr::select(c("gene_id", "PValue_withsign")) %>% #only want the Pvalue with sign
      with(structure(PValue_withsign, names = gene_id)) %>% 
      rev() # reverse so the start of the list is upregulated genes
  }, simplify = FALSE)

# Run fgsea
# This takes a while due to the number of permutations
# set a seed for a reproducible result
set.seed(33)
fgsea_res <- ranks %>%
  lapply(function(x){
    fgseaMultilevel(stats = x, pathways = c(KEGG, ireGenes)) %>%
      as_tibble() %>%
      dplyr::rename(FDR = padj) %>%
      mutate(padj = p.adjust(pval, "bonferroni")) %>%
      dplyr::select(c(geneset = pathway), pval, FDR, padj, everything()) %>%
      arrange(pval) %>%
      mutate(sig = padj < 0.05)
  })

fgsea_res$`Female 2v3` %<>% mutate(coef = "Female 2v3")
fgsea_res$`Male 2v3` %<>% mutate(coef = "Male 2v3")
fgsea_res$`Female 4v3` %<>% mutate(coef = "Female 4v3")
fgsea_res$`Male 4v3` %<>% mutate(coef = "Male 4v3")


# harmonic mean 
HMP <- 
  fryRes %>% 
  bind_rows() %>% 
  dplyr::select(geneset, PValue, coef) %>% 
  dplyr::rename(fry_p = PValue) %>% 
  left_join(camera_res %>% 
              bind_rows() %>% 
              dplyr::select(geneset, PValue, coef), 
            by = c("geneset", "coef")) %>% 
  dplyr::rename(camera_p = PValue) %>% 
  left_join(fgsea_res %>% 
              bind_rows() %>% 
              dplyr::select(geneset, pval, coef), 
            by = c("geneset", "coef")) %>% 
  dplyr::rename(fgsea_p = pval) %>% 
  bind_rows() %>% 
  nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>% 
  mutate(
    harmonic_p = vapply(p, function(x){
      x <- unlist(x)
      x <- x[!is.na(x)]
      p.hmp(x, L = 3)
    }, numeric(1))
  ) %>% 
  unnest() %>% 
  mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"), 
         sig = harmonic_p_FDR < 0.05) %>% 
  arrange(harmonic_p_FDR)

vis

Number of gene sets

HMP %>% 
  dplyr::filter(harmonic_p_FDR < 0.01) %>% #group_by(coef) %>% summarise(n = n())
  ggplot(aes(coef)) +
  geom_bar() +
  theme_bw()

HMP %>% 
  mutate(Sex = case_when(
    grepl(coef, pattern = "Male") ~ "male", 
    grepl(coef, pattern = "Female") ~ "female", 
  ), 
  comparison = case_when(
    grepl(coef, pattern = "4v3") ~ "APOE4 v APOE3", 
    grepl(coef, pattern = "2v3") ~ "APOE2 v APOE3", 
  )) %>% 
  dplyr::filter(harmonic_p_FDR < 0.01) %>% 
  ggplot(aes(y= reorder(geneset, -harmonic_p), 
             x = coef, 
             fill= -log10(harmonic_p))) +
  geom_tile(colour = "black") +
  theme_bw() +
  facet_wrap(~Sex+comparison, scales = "free") +
  scale_fill_viridis_c() +
  easy_rotate_x_labels(angle = -45) +
  labs(x = "", 
       y = "Gene set") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) 

  #ggsave("plotsforpub/HMP.png", width = 21.5, height = 10, units = "cm", dpi = 600, scale = 2)

upset DE genes

upset4v3male <- c(KEGG, ireGenes) %>% 
  .[c(HMP %>%
        dplyr::filter(coef == "Male 4v3" & harmonic_p_FDR < 0.01) %>%
        .$geneset)] %>% 
  lapply(function(x) {
    x %>% 
      intersect(toptables_cqn$`Male 4v3` %>% 
                  dplyr::filter(DE == TRUE) %>% 
                  .$gene_id)
  }) %>% 
  fromList() %>% 
  upset(nsets = length(.), 
        order.by = "freq"
  )

# convert to cowplot 
upset4v3male_cow <- cowplot::plot_grid(NULL, upset4v3male$Main_bar, 
                                   upset4v3male$Sizes, upset4v3male$Matrix,
                            nrow=2, align='hv', rel_heights = c(1,1),
                           rel_widths = c(1,2))

## female 4v3
upset4v3female <- c(KEGG, ireGenes) %>% 
  .[c(HMP %>%
        dplyr::filter(coef == "Female 4v3" & harmonic_p_FDR < 0.01) %>%
        .$geneset)] %>% 
  lapply(function(x) {
    x %>% 
      intersect(toptables_cqn$`Female 4v3` %>% 
                  dplyr::filter(DE == TRUE) %>% 
                  .$gene_id)
  }) %>% 
  fromList() %>% 
  upset(nsets = length(.), 
        order.by = "freq"
  )

# convert to cowplot 
upset4v3fem_cow <- cowplot::plot_grid(NULL, upset4v3female$Main_bar, 
                                      upset4v3female$Sizes, 
                                      upset4v3female$Matrix,
                                      nrow=2, align='hv', rel_heights = c(1,1),
                                      rel_widths = c(1,2))

## female 2v3
upset2v3female <- c(KEGG, ireGenes) %>% 
  .[c(HMP %>%
        dplyr::filter(coef == "Female 2v3" & harmonic_p_FDR < 0.01) %>%
        .$geneset)] %>% 
  lapply(function(x) {
    x %>% 
      intersect(toptables_cqn$`Female 2v3` %>% 
                  dplyr::filter(DE == TRUE) %>% 
                  .$gene_id)
  }) %>% 
  fromList() %>% 
  upset(nsets = length(.), 
        order.by = "freq"
  )

# convert to cowplot 
upset2v3fem_cow <- cowplot::plot_grid(NULL, upset2v3female$Main_bar, 
                                      upset2v3female$Sizes, 
                                      upset2v3female$Matrix,
                                      nrow=2, align='hv', rel_heights = c(1,1),
                                      rel_widths = c(1,2))

upsets <- grid.arrange(upset4v3male_cow) 

  # ggsave(upsets, filename = "plotsforpub/upset_HMP_degenes.png", 
  #        width = 18, height = 10, units = "cm", dpi = 600, scale = 2)
toptables_cqn %>% 
    bind_rows() %>% 
    mutate(rankstat = sign(logFC)*-log10(PValue)) %>% 
    ggplot(aes(x = len, y = rankstat)) +
    geom_point(alpha = 0.5, 
               aes(colour = DE)) +
    coord_cartesian(ylim = c(-20, 20)) +
    facet_wrap(~ coef, scales = "free") +
    scale_x_continuous(limits = c(10, 20000), 
                       trans = scales::log10_trans(), ) +
    scale_color_manual(values = c("grey50", "red")) +
    theme_bw() + 
    labs(x = "Average transcript length per gene ", 
         y = "sign(logFC)*-log10(PValue)", 
         colour = "Differentially expxressed?") +
    geom_smooth(se = F)

cell type check

cell_type_markers <- read_xls("data/cell_type_genes41586_2007_BFnature05453_MOESM11_ESM.xls") %>% 
  dplyr::select(c("Neurons" = "Neuron-enriched"), c("Astrocytes" = "Astrocyte-enriched"), c("Oligodendrocytes" = "Oligodendrocyte-enriched")) %>% 
  gather("cell_type", "gene_name") %>% 
  left_join(x_zhaoetal$genes %>% 
              as.data.frame %>% 
              rownames_to_column("gene_id") %>% 
              dplyr::select(gene_name, gene_id)) %>% 
  na.omit %>% 
  dplyr::select(-gene_name) %>% 
  split(f = .$cell_type) %>% 
  lapply(function(x) {
    dplyr::select(x, "gene_id") %>% 
      .$gene_id
  })

cell_type_markers$Microglia <-  read_excel("data/microglia_oosterofetal.xlsx",
                                           sheet = 1) %>%
  .$Mouse.Ensembl.Gene.ID
ggarrange(
  colnames(contrasts) %>% 
    sapply(function(x){
      logCPM_zhaeoetal %>% 
        fry(index = cell_type_markers,
            design = design,
            contrast= contrasts[,x],
            #sort = "directional"
        ) %>% 
        as.data.frame() %>% 
        rownames_to_column("geneset") %>% 
        as_tibble() %>% 
        mutate(pbonf = p.adjust(PValue, "bonferroni"), 
               coef = x)
    }, simplify = FALSE
    ) %>% 
    bind_rows() %>% 
    ggplot(aes(x = -log10(PValue), y = geneset)) +
    geom_col(aes(fill = FDR < 0.05)) +
    facet_wrap(~coef, nrow = 1) +
    theme_bw() +
    scale_fill_manual(values = c("grey50", "red")),
  


ggarrange(
x_zhaoetal %>% 
  cpm(log=TRUE) %>%
  as.data.frame() %>%
  rownames_to_column("gene_id") %>%
  dplyr::filter(gene_id %in% cell_type_markers$Neurons) %>%
  gather(key = "sample", value = "logCPM", colnames(x_zhaoetal)) %>%
  left_join(x_zhaoetal$samples) %>% 
  ggplot(aes(x = Genotype, y = logCPM)) +
  geom_violin(trim = FALSE,
    aes(fill = Genotype),
    alpha = 0.75) +
  facet_wrap(~Sex) +
  geom_boxplot(width = .4) +
  stat_summary(geom = "point", shape = 23, fill = "red", size = 3) +
  theme_bw() +
  labs(x = "",
       fill = "Genotype") +
  ggtitle("Neurons") +
  scale_fill_viridis_d(),

x_zhaoetal %>% 
  cpm(log=TRUE) %>%
  as.data.frame() %>%
  rownames_to_column("gene_id") %>%
  dplyr::filter(gene_id %in% cell_type_markers$Astrocytes) %>%
  gather(key = "sample", value = "logCPM", colnames(x_zhaoetal)) %>%
  left_join(x_zhaoetal$samples) %>%
  ggplot(aes(x = Genotype, y = logCPM)) +
  geom_violin(trim = FALSE,
    aes(fill = Genotype),
    alpha = 0.75) +
  facet_wrap(~Sex) +
  geom_boxplot(width = .4) +
  stat_summary(geom = "point", shape = 23, fill = "red", size = 3) +
  theme_bw() +
  labs(x = "",
       fill = "Genotype") +
  ggtitle("Astrocytes") +
  scale_fill_viridis_d(),

x_zhaoetal %>% 
  cpm(log=TRUE) %>%
  as.data.frame() %>%
  rownames_to_column("gene_id") %>%
  dplyr::filter(gene_id %in% cell_type_markers$Oligodendrocytes) %>%
  gather(key = "sample", value = "logCPM", colnames(x_zhaoetal)) %>%
  left_join(x_zhaoetal$samples) %>%
  ggplot(aes(x = Genotype, y = logCPM)) +
  geom_violin(trim = FALSE,
    aes(fill = Genotype),
    alpha = 0.75) +
  facet_wrap(~Sex) +
  geom_boxplot(width = .4) +
  stat_summary(geom = "point", shape = 23, fill = "red", size =3) +
  theme_bw() +
  labs(x = "",
       fill = "Genotype") +
  ggtitle("Oligodendrocytes") +
  scale_fill_viridis_d(),

x_zhaoetal %>% 
  cpm(log=TRUE) %>%
  as.data.frame() %>%
  rownames_to_column("gene_id") %>%
  dplyr::filter(gene_id %in% cell_type_markers$Microglia) %>%
  gather(key = "sample", value = "logCPM", colnames(x_zhaoetal)) %>%
  left_join(x_zhaoetal$samples) %>%
  ggplot(aes(x = Genotype, y = logCPM)) +
  geom_violin(trim = FALSE,
    aes(fill = Genotype),
    alpha = 0.75) +
  facet_wrap(~Sex) +
  geom_boxplot(width = .4) +
  stat_summary(geom = "point", shape = 23, fill = "red", size = 3) +
  theme_bw() +
  labs(x = "",
       fill = "Genotype") +
  ggtitle("Microglia") +
  scale_fill_viridis_d(),
common.legend = TRUE
), 
common.legend = FALSE,
heights = c(1,4), 
ncol = 1, 
labels = "AUTO"
) 

  # ggsave("plotsforpub/celltype.png")

Session info

sessionInfo() %>% pander()

R version 4.0.2 (2020-06-22)

Platform: x86_64-apple-darwin17.0 (64-bit)

locale: en_AU.UTF-8||en_AU.UTF-8||en_AU.UTF-8||C||en_AU.UTF-8||en_AU.UTF-8

attached base packages: stats4, parallel, grid, splines, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: ggsci(v.2.9), pathview(v.1.28.1), ggeasy(v.0.1.2), cowplot(v.1.1.0), RColorBrewer(v.1.1-2), rtracklayer(v.1.48.0), pander(v.0.6.3), kableExtra(v.1.3.1), UpSetR(v.1.4.0), pheatmap(v.1.0.12), ggpubr(v.0.4.0), msigdbr(v.7.2.1), ensembldb(v.2.12.1), AnnotationFilter(v.1.12.0), GenomicFeatures(v.1.40.1), AnnotationDbi(v.1.50.3), Biobase(v.2.48.0), GenomicRanges(v.1.40.0), GenomeInfoDb(v.1.24.2), IRanges(v.2.22.2), S4Vectors(v.0.26.1), AnnotationHub(v.2.20.2), BiocFileCache(v.1.12.1), dbplyr(v.2.0.0), BiocGenerics(v.0.34.0), gridExtra(v.2.3), readxl(v.1.3.1), harmonicmeanp(v.3.0), FMStable(v.0.1-2), scales(v.1.1.1), cqn(v.1.34.0), quantreg(v.5.75), SparseM(v.1.78), preprocessCore(v.1.50.0), nor1mix(v.1.3-0), mclust(v.5.4.7), goseq(v.1.40.0), geneLenDataBase(v.1.24.0), BiasedUrn(v.1.07), ggrepel(v.0.8.2), ggfortify(v.0.4.11), fgsea(v.1.14.0), edgeR(v.3.30.3), limma(v.3.44.3), magrittr(v.2.0.1), forcats(v.0.5.0), stringr(v.1.4.0), dplyr(v.1.0.2), purrr(v.0.3.4), readr(v.1.4.0), tidyr(v.1.1.2), tibble(v.3.0.4), ggplot2(v.3.3.2) and tidyverse(v.1.3.0)

loaded via a namespace (and not attached): tidyselect(v.1.1.0), RSQLite(v.2.2.1), BiocParallel(v.1.22.0), munsell(v.0.5.0), statmod(v.1.4.35), withr(v.2.3.0), colorspace(v.2.0-0), highr(v.0.8), knitr(v.1.30), rstudioapi(v.0.13), ggsignif(v.0.6.0), labeling(v.0.4.2), KEGGgraph(v.1.48.0), GenomeInfoDbData(v.1.2.3), farver(v.2.0.3), bit64(v.4.0.5), vctrs(v.0.3.5), generics(v.0.1.0), xfun(v.0.19), R6(v.2.5.0), locfit(v.1.5-9.4), bitops(v.1.0-6), DelayedArray(v.0.14.1), assertthat(v.0.2.1), promises(v.1.1.1), gtable(v.0.3.0), conquer(v.1.0.2), rlang(v.0.4.9), MatrixModels(v.0.4-1), rstatix(v.0.6.0), lazyeval(v.0.2.2), broom(v.0.7.2), BiocManager(v.1.30.10), yaml(v.2.2.1), abind(v.1.4-5), modelr(v.0.1.8), backports(v.1.2.0), httpuv(v.1.5.4), tools(v.4.0.2), ellipsis(v.0.3.1), Rcpp(v.1.0.5), plyr(v.1.8.6), progress(v.1.2.2), zlibbioc(v.1.34.0), RCurl(v.1.98-1.2), prettyunits(v.1.1.1), openssl(v.1.4.3), SummarizedExperiment(v.1.18.2), haven(v.2.3.1), fs(v.1.5.0), data.table(v.1.13.2), openxlsx(v.4.2.3), reprex(v.0.3.0), ProtGenerics(v.1.20.0), matrixStats(v.0.57.0), hms(v.0.5.3), mime(v.0.9), evaluate(v.0.14), xtable(v.1.8-4), XML(v.3.99-0.5), rio(v.0.5.16), compiler(v.4.0.2), biomaRt(v.2.44.4), crayon(v.1.3.4), htmltools(v.0.5.0), mgcv(v.1.8-33), later(v.1.1.0.1), lubridate(v.1.7.9.2), DBI(v.1.1.0), rappdirs(v.0.3.1), Matrix(v.1.2-18), car(v.3.0-10), cli(v.2.2.0), pkgconfig(v.2.0.3), GenomicAlignments(v.1.24.0), foreign(v.0.8-80), xml2(v.1.3.2), webshot(v.0.5.2), XVector(v.0.28.0), rvest(v.0.3.6), digest(v.0.6.27), graph(v.1.66.0), Biostrings(v.2.56.0), rmarkdown(v.2.5), cellranger(v.1.1.0), fastmatch(v.1.1-0), curl(v.4.3), shiny(v.1.5.0), Rsamtools(v.2.4.0), lifecycle(v.0.2.0), nlme(v.3.1-150), jsonlite(v.1.7.1), carData(v.3.0-4), viridisLite(v.0.3.0), askpass(v.1.1), fansi(v.0.4.1), pillar(v.1.4.7), lattice(v.0.20-41), KEGGREST(v.1.28.0), fastmap(v.1.0.1), httr(v.1.4.2), GO.db(v.3.11.4), interactiveDisplayBase(v.1.26.3), glue(v.1.4.2), zip(v.2.1.1), png(v.0.1-7), BiocVersion(v.3.11.1), bit(v.4.0.4), Rgraphviz(v.2.32.0), stringi(v.1.5.3), blob(v.1.2.1), org.Hs.eg.db(v.3.11.4) and memoise(v.1.1.0)